CCR6- & CCR6+(Naive Cells) and CCR6- & CCR6+ (Memory Cells)
PI Name: Dr.Josuah Farber, NIAID Staff Scientist: Dr. Satya Singh, NIAID Bioinformatics Analyst: Arun Boddapati, NCBR, FNLCR
Comparing the progression of Naive Cells to Memory Cells by RNA-seq experiments:
The project examines the progression of CCR6- and CCR6+ Naive cells to CCR6+ Memory cells, to find markers that are indictive of CCR6+ Naive cells to CCR6- Memory Cells to CCR6+ Memory cells.
Methods:
RNA-seq analysis was performed using ccbr pipeliner (https://github.com/CCBR/Pipeliner) 1. Quality analysis 2. Trimming of adapters 3. Removing over-represented sequences 4. Alignment 5. Quantification 6. Quality control is available in multiQCReport.html
CPM based filtering is used to remove genes having expression less than 10-15 read counts across atleast 4 samples Filtering the genes using cpm filtering for only Naive Samples (Paired)
#CPM based filtering step
y <- DGEList(Read_input,group = TS )
keep = rowSums(cpm(y)>1)>=2
table(keep)
## keep
## FALSE TRUE
## 45380 13490
y <- y[keep, , keep.lib.sizes=FALSE]
y <- calcNormFactors(y)
DT::datatable(y$samples,
extensions = c('FixedColumns',"FixedHeader"),
options = list(scrollX = TRUE,
paging=TRUE,
fixedHeader=TRUE))
Plotting the libary sizes and unnormalized counts for Naive Samples (Paired) Here the Barplot shows library sizes of CCR6 -/+ Samples from RNA-Seq Experiments and boxplot of unnormalized counts
#Barplot showing library sizes
par(mfrow=c(1,2))
barplot(y$samples$lib.size,names=colnames(y),las=2)
title("Library Sizes")
logcounts <- cpm(y,log=TRUE, prior.count = 0.5)
logcounts[is.na(logcounts)] = 0
logcounts[logcounts<0] = 0
#Boxplot showing unnormalized logCPM counts
boxplot(logcounts, xlab="", ylab="Log2 counts per million",las=2, outline = F,col = "red")
abline(h=median(logcounts),col="blue")
title("Unnormalized logCPMs")
ab = data.frame(logcounts)
colnames(ab) = pheno$Samples
setDT(ab, keep.rownames = T)
ab = melt(ab, id = "rn")
p = ggplot(ab, aes(x=value, colour = variable))+ geom_density(alpha=0.25)
p <- ggplotly(p)
p
#Voom Normalization function
v <- voom(y,design,plot = TRUE,normalize.method = "quantile")
ab = data.frame(v)
colnames(ab) = pheno$Samples
setDT(ab, keep.rownames = T)
ab = melt(ab, id = "rn")
p = ggplot(ab, aes(x=value, colour = variable))+ geom_density(alpha=0.25)
p <- ggplotly(p)
p
#Heatmap function
d=Dist(t(v$E),method="euclidean",diag=TRUE)
m=as.matrix(d)
new.palette=colorRampPalette(c("black","red","yellow","white"),space="rgb")
heatmap(m,symm=TRUE,col=new.palette(20),cexRow= 0.5, cexCol = 0.5)
par(mfrow=c(1,2))
#Boxplot for Normalized and Unnormalized logCPM
boxplot(logcounts, xlab="", ylab="Log2 counts per million",las=2,main="Unnormalised logCPM", outline=F,col = "red", cex.axis = 0.5, cex.lab=1)
abline(h=median(logcounts),col="blue")
boxplot(v$E, xlab="", ylab="Log2 counts per million",las=2,main="Voom transformed CPM", outline=F,col = "green", cex.axis = 0.5, cex.lab=1)
abline(h=median(v$E),col="blue")
fit <- lmFit(v$E, design)
cont.matrix <- makeContrasts(MNvsNN = Memory_Negative-Naive_Negative,
MPvsNN = Memory_Positive-Naive_Negative,
NPvsNN = Naive_Positive-Naive_Negative,
MNvsNP = Memory_Negative-Naive_Positive,
MPvsNP = Memory_Positive-Naive_Positive,
MPvsMN = Memory_Positive-Memory_Negative,
levels=design)
#View(cont.matrix)
fit <- contrasts.fit(fit, cont.matrix)
fit_final<- eBayes(fit)
MNvsNN= topTable(fit_final, coef="MNvsNN", n=Inf)
MPvsNN = topTable(fit_final, coef = "MPvsNN", n=Inf)
NPvsNN = topTable(fit_final, coef = "NPvsNN", n=Inf)
MNvsNP = topTable(fit_final, coef="MNvsNP", n=Inf)
MPvsNP = topTable(fit_final, coef="MPvsNP", n=Inf)
MPvsMN = topTable(fit_final, coef = "MPvsMN", n = Inf)
summa.fit <- decideTests(fit_final)
summary(summa.fit)
## MNvsNN MPvsNN NPvsNN MNvsNP MPvsNP MPvsMN
## Down 1940 1931 142 225 390 4
## NotSig 9953 9865 12992 12973 12389 13461
## Up 1597 1694 356 292 711 25
MNvsNNsubset = subset(MNvsNN, MNvsNN$adj.P.Val < 0.05 & abs(MNvsNN$logFC)> 2)
MPvsNNsubset = subset(MPvsNN, MPvsNN$adj.P.Val < 0.05 & abs(MPvsNN$logFC)> 2)
NPvsNNsubset = subset(NPvsNN, NPvsNN$adj.P.Val < 0.05 & abs(NPvsNN$logFC)> 2)
MNvsNPsubset = subset(MNvsNP, MNvsNP$adj.P.Val < 0.05 & abs(MNvsNP$logFC)> 2)
MPvsNPsubset = subset(MPvsNP, MPvsNP$adj.P.Val < 0.05 & abs(MPvsNP$logFC)> 2)
MPvsMNsubset = subset(MPvsMN, MPvsMN$adj.P.Val < 0.05 & abs(MPvsMN$logFC)> 2)
#write.table(MPvsMN, "../Results/LimmaResults/MPvsMN_DE.txt", sep = "\t", quote = F, row.names = T, col.names = T)
library(data.table)
setDT(MNvsNN, keep.rownames = T, check.names = T)
MNvsNN = MNvsNN[,c(1,2,5,6)]
colnames(MNvsNN) = c("GeneName","logFC_MNvsNN","P-val_MNvsNN","FDR_MNvsNN")
setDT(MPvsNN, keep.rownames = T, check.names = T)
MPvsNN = MPvsNN[,c(1,2,5,6)]
colnames(MPvsNN) = c("GeneName","logFC_MPvsNN","P-val_MPvsNN","FDR_MPvsNN")
setDT(NPvsNN, keep.rownames = T, check.names = T)
NPvsNN = NPvsNN[,c(1,2,5,6)]
colnames(NPvsNN) = c("GeneName","logFC_NPvsNN","P-val_NPvsNN","FDR_NPvsNN")
setDT(MNvsNP, keep.rownames = T, check.names = T)
MNvsNP = MNvsNP[,c(1,2,5,6)]
colnames(MNvsNP) = c("GeneName","logFC_MNvsNP","P-val_MNvsNP","FDR_MNvsNP")
setDT(MPvsNP, keep.rownames = T, check.names = T)
MPvsNP = MPvsNP[,c(1,2,5,6)]
colnames(MPvsNP) = c("GeneName","logFC_MPvsNP","P-val_MPvsNP","FDR_MPvsNP")
setDT(MPvsMN, keep.rownames = T, check.names = T)
MPvsMN = MPvsMN[,c(1,2,5,6)]
colnames(MPvsMN) = c("GeneName","logFC_MPvsMN","P-val_MPvsMN","FDR_MPvsMN")
df_merged = Reduce(function(x, y) merge(x, y, all = T, by = "GeneName"), list(MNvsNN,MPvsNN,NPvsNN,MNvsNP,MPvsNP,MPvsMN))
#df_merged = round_df(df_merged, digits=7)
DT::datatable(df_merged,
extensions = c('FixedColumns',"FixedHeader"),
options = list(scrollX = TRUE,
paging=TRUE,
fixedHeader=TRUE))
fpkm = read.table("../Rawdata/RSEM.genes.FPKM.all_samples.txt",header = T, sep = "\t")
fpkm$GeneName = paste0(fpkm$gene_id,"|", fpkm$GeneName)
colnames(fpkm)[2] = c("GeneName")
fpkm = fpkm[,c(2:10)]
mergedset = merge(df_merged, fpkm, by = "GeneName", all = F )
write.csv(mergedset, "../Results-New/DifferentiallyExpressionTable_withNormalizedExpressionvalues.csv", row.names = F, col.names = T,quote = F)
subsetresult1 = data.frame(subset(MNvsNN, MNvsNN$FDR_MNvsNN < 0.05 & abs(MNvsNN$logFC_MNvsNN)>2))
#setDT(subsetresult1, keep.rownames = T, check.names = T)
#colnames(subsetresult1)[1] = c("GeneName")
fpkm = read.table("../Rawdata/RSEM.genes.FPKM.all_samples.txt",header = T, sep = "\t")
fpkm$GeneName = paste0(fpkm$gene_id,"|", fpkm$GeneName)
colnames(fpkm)[2] = c("GeneName")
fpkm = fpkm[,c(2:10)]
# removebatch1 = data.frame(removebatch)
# setDT(removebatch1, keep.rownames = T, check.names = T)
# colnames(removebatch1)[1] = c("GeneName")
# removebatch1 = removebatch1[,c(1,2,3,6,7)]
mergedset = merge(subsetresult1, fpkm, by = "GeneName", all = F )
mergedset = mergedset[,c(1,9,10,7,12)]
#write.csv(mergedset, "DifferentialExpressionTable_WithNormalizedExpressionVlaues.csv", row.names = F, col.names = T,quote = F)
mergedset = as.matrix(mergedset)
rownames(mergedset) = mergedset[,1]
mergedset = mergedset[,-1]
mergedset = data.frame(mergedset)
#mergedset[is.na(mergedset)] = 0
mergedset[1:4] = lapply(mergedset[1:4],function(x) as.character(x))
mergedset[1:4] = lapply(mergedset[1:4],function(x) as.numeric(x))
mergedset = as.matrix(mergedset)
mergedset = mergedset[rowSums(mergedset)>0,]
col = colorRampPalette( c("blue","white","red") )
#pdf("Heatmap_MNvsNN_TopDE_FDR<0.05.pdf", height = 20, width = 10)
heatmap.2(mergedset,
scale = "row",
trace = "none",
col = col ,
Colv = F ,
Rowv = T,
dendrogram = "row",
distfun = function(x) as.dist(1-cor(t(x))) ,
hclustfun = function(x) hclust(x,method="average"),
cexRow = 0.3 , cexCol = 0.7,main = "Differentially expressed Genes" )
#dev.off()
subsetresult1 = data.frame(subset(MPvsNN, MPvsNN$FDR_MPvsNN < 0.05 & abs(MPvsNN$logFC_MPvsNN)>2))
#setDT(subsetresult1, keep.rownames = T, check.names = T)
colnames(subsetresult1)[1] = c("GeneName")
fpkm = read.table("../Rawdata/RSEM.genes.FPKM.all_samples.txt",header = T, sep = "\t")
fpkm$GeneName = paste0(fpkm$gene_id,"|", fpkm$GeneName)
colnames(fpkm)[2] = c("GeneName")
fpkm = fpkm[,c(2:10)]
# removebatch2 = data.frame(removebatch)
# setDT(removebatch2, keep.rownames = T, check.names = T)
# colnames(removebatch2)[1] = c("GeneName")
# removebatch2 = removebatch2[,c(1, 4:7)]
mergedset = merge(subsetresult1, fpkm, by = "GeneName", all = F)
mergedset = mergedset[,c(1, 8,11,7,12)]
mergedset = as.matrix(mergedset)
rownames(mergedset) = mergedset[,1]
mergedset = mergedset[,-1]
mergedset = data.frame(mergedset)
#mergedset[is.na(mergedset)] = 0
mergedset[1:4] = lapply(mergedset[1:4],function(x) as.character(x))
mergedset[1:4] = lapply(mergedset[1:4],function(x) as.numeric(x))
mergedset = as.matrix(mergedset)
mergedset = mergedset[rowSums(mergedset)>0,]
col = colorRampPalette( c("blue","white","red") )
#pdf("Heatmap_MPvsNN_TopDE_FDR<0.05.pdf", height = 20, width = 10)
heatmap.2(mergedset,
scale = "row",
trace = "none",
col = col ,
Colv = F ,
Rowv = T,
dendrogram = "row",
distfun = function(x) as.dist(1-cor(t(x))) ,
hclustfun = function(x) hclust(x,method="average"),
cexRow = 0.3 , cexCol = 0.7,main = "Differentially expressed Genes" )
#dev.off()
subsetresult1 = data.frame(subset(NPvsNN, NPvsNN$FDR_NPvsNN < 0.05 & abs(NPvsNN$logFC_NPvsNN)>2))
#setDT(subsetresult1, keep.rownames = T, check.names = T)
colnames(subsetresult1)[1] = c("GeneName")
fpkm = read.table("../Rawdata/RSEM.genes.FPKM.all_samples.txt",header = T, sep = "\t")
fpkm$GeneName = paste0(fpkm$gene_id,"|", fpkm$GeneName)
colnames(fpkm)[2] = c("GeneName")
fpkm = fpkm[,c(2:10)]
# removebatch3 = data.frame(removebatch)
# setDT(removebatch3, keep.rownames = T, check.names = T)
# colnames(removebatch3)[1] = c("GeneName")
# removebatch3 = removebatch3[,c(1, 8,9,6,7)]
mergedset = merge(subsetresult1, fpkm, by = "GeneName", all = F)
mergedset = mergedset[,c(1,5:7,12 )]
mergedset = as.matrix(mergedset)
rownames(mergedset) = mergedset[,1]
mergedset = mergedset[,-1]
mergedset = data.frame(mergedset)
#mergedset[is.na(mergedset)] = 0
mergedset[1:4] = lapply(mergedset[1:4],function(x) as.character(x))
mergedset[1:4] = lapply(mergedset[1:4],function(x) as.numeric(x))
mergedset = as.matrix(mergedset)
mergedset = mergedset[rowSums(mergedset)>0,]
col = colorRampPalette( c("blue","white","red") )
#pdf("Heatmap_NPvsNN_TopDE_FDR<0.05.pdf", height = 20, width = 10)
heatmap.2(mergedset,
scale = "row",
trace = "none",
col = col ,
Colv = F ,
Rowv = T,
dendrogram = "row",
distfun = function(x) as.dist(1-cor(t(x))) ,
hclustfun = function(x) hclust(x,method="average"),
cexRow = 0.3 , cexCol = 0.7,main = "Differentially expressed Genes" )
#dev.off()
subsetresult1 = data.frame(subset(MPvsNP, MPvsNP$FDR_MPvsNP < 0.05 & abs(MPvsNP$logFC_MPvsNP)>2))
#setDT(subsetresult1, keep.rownames = T, check.names = T)
colnames(subsetresult1)[1] = c("GeneName")
fpkm = read.table("../Rawdata/RSEM.genes.FPKM.all_samples.txt",header = T, sep = "\t")
fpkm$GeneName = paste0(fpkm$gene_id,"|", fpkm$GeneName)
colnames(fpkm)[2] = c("GeneName")
fpkm = fpkm[,c(2:10)]
# removebatch4 = data.frame(removebatch)
# setDT(removebatch4, keep.rownames = T, check.names = T)
# colnames(removebatch4)[1] = c("GeneName")
# removebatch4 = removebatch4[,c(1, 4,5,8,9)]
mergedset = merge(subsetresult1, fpkm, by = "GeneName", all = F)
mergedset = mergedset[,c(1, 8,11,5,6)]
mergedset = as.matrix(mergedset)
rownames(mergedset) = mergedset[,1]
mergedset = mergedset[,-1]
mergedset = data.frame(mergedset)
#mergedset[is.na(mergedset)] = 0
mergedset[1:4] = lapply(mergedset[1:4],function(x) as.character(x))
mergedset[1:4] = lapply(mergedset[1:4],function(x) as.numeric(x))
mergedset = as.matrix(mergedset)
mergedset = mergedset[rowSums(mergedset)>0,]
col = colorRampPalette( c("blue","white","red") )
#pdf("Heatmap_MPvsNP_TopDE_FDR<0.05.pdf", height = 20, width = 10)
heatmap.2(mergedset,
scale = "row",
trace = "none",
col = col ,
Colv = F ,
Rowv = T,
dendrogram = "row",
distfun = function(x) as.dist(1-cor(t(x))) ,
hclustfun = function(x) hclust(x,method="average"),
cexRow = 0.3 , cexCol = 0.7,main = "Differentially expressed Genes" )
#dev.off()
subsetresult1 = data.frame(subset(MNvsNP, MNvsNP$FDR_MNvsNP< 0.05 & abs(MNvsNP$logFC_MNvsNP)>2))
#setDT(subsetresult1, keep.rownames = T, check.names = T)
colnames(subsetresult1)[1] = c("GeneName")
fpkm = read.table("../Rawdata/RSEM.genes.FPKM.all_samples.txt",header = T, sep = "\t")
fpkm$GeneName = paste0(fpkm$gene_id,"|", fpkm$GeneName)
colnames(fpkm)[2] = c("GeneName")
fpkm = fpkm[,c(2:10)]
# removebatch4 = data.
# removebatch5 = data.frame(removebatch)
# setDT(removebatch5, keep.rownames = T, check.names = T)
# colnames(removebatch5)[1] = c("GeneName")
# removebatch5 = removebatch5[,c(1, 2,3,8,9)]
mergedset = merge(subsetresult1, fpkm, by = "GeneName", all = F)
mergedset = mergedset[,c(1, 9,10,5,6)]
mergedset = as.matrix(mergedset)
rownames(mergedset) = mergedset[,1]
mergedset = mergedset[,-1]
mergedset = data.frame(mergedset)
#mergedset[is.na(mergedset)] = 0
mergedset[1:4] = lapply(mergedset[1:4],function(x) as.character(x))
mergedset[1:4] = lapply(mergedset[1:4],function(x) as.numeric(x))
mergedset = as.matrix(mergedset)
mergedset = mergedset[rowSums(mergedset)>0,]
col = colorRampPalette( c("blue","white","red") )
#pdf("Heatmap_MNvsNP_TopDE_FDR<0.05.pdf", height = 20, width = 10)
heatmap.2(mergedset,
scale = "row",
trace = "none",
col = col ,
Colv = F ,
Rowv = T,
dendrogram = "row",
distfun = function(x) as.dist(1-cor(t(x))) ,
hclustfun = function(x) hclust(x,method="average"),
cexRow = 0.3 , cexCol = 0.7,main = "Differentially expressed Genes" )
#dev.off()
subsetresult1 = data.frame(subset(MPvsMN, MPvsMN$FDR_MPvsMN < 0.05 & abs(MPvsMN$logFC_MPvsMN)>2))
#setDT(subsetresult1, keep.rownames = T, check.names = T)
colnames(subsetresult1)[1] = c("GeneName")
fpkm = read.table("../Rawdata/RSEM.genes.FPKM.all_samples.txt",header = T, sep = "\t")
fpkm$GeneName = paste0(fpkm$gene_id,"|", fpkm$GeneName)
colnames(fpkm)[2] = c("GeneName")
fpkm = fpkm[,c(2:10)]
# removebatch4 = data.
# removebatch6 = data.frame(removebatch)
# setDT(removebatch6, keep.rownames = T, check.names = T)
# colnames(removebatch6)[1] = c("GeneName")
# removebatch6 = removebatch6[,c(1:5)]
mergedset = merge(subsetresult1, fpkm, by = "GeneName", all = F)
mergedset = mergedset[,c(1, 8,11,9,10)]
mergedset = as.matrix(mergedset)
rownames(mergedset) = mergedset[,1]
mergedset = mergedset[,-1]
mergedset = data.frame(mergedset)
#mergedset[is.na(mergedset)] = 0
mergedset[1:4] = lapply(mergedset[1:4],function(x) as.character(x))
mergedset[1:4] = lapply(mergedset[1:4],function(x) as.numeric(x))
mergedset = as.matrix(mergedset)
mergedset = mergedset[rowSums(mergedset)>0,]
col = colorRampPalette( c("blue","white","red") )
#pdf("Heatmap_MPvsMN_TopDE_FDR<0.05.pdf", height = 20, width = 10)
heatmap.2(mergedset,
scale = "row",
trace = "none",
col = col ,
Colv = F ,
Rowv = T,
dendrogram = "row",
distfun = function(x) as.dist(1-cor(t(x))) ,
hclustfun = function(x) hclust(x,method="average"),
cexRow = 0.3 , cexCol = 0.7,main = "Differentially expressed Genes" )
#dev.off()
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] scales_1.0.0 forestmangr_0.9.1 rmarkdown_1.14
## [4] ggdendro_0.1-20 dendextend_1.12.0 tidyr_0.8.3
## [7] mixOmics_6.8.0 lattice_0.20-38 MASS_7.3-51.4
## [10] gplots_3.0.1.1 vivlib_1.0.0 dplyr_0.8.3
## [13] data.table_1.12.2 NCBR.RTools_1.0 splitstackshape_1.4.8
## [16] plotly_4.9.0 amap_0.8-17 RColorBrewer_1.1-2
## [19] reshape2_1.4.3 kableExtra_1.1.0 knitr_1.24
## [22] ggfortify_0.4.7 ggplot2_3.2.1 edgeR_3.26.7
## [25] limma_3.40.6
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-6 matrixStats_0.54.0 webshot_0.5.1
## [4] httr_1.4.1 tools_3.6.1 backports_1.1.5
## [7] R6_2.4.0 DT_0.8 KernSmooth_2.23-15
## [10] lazyeval_0.2.2 colorspace_1.4-1 withr_2.1.2
## [13] tidyselect_0.2.5 gridExtra_2.3 compiler_3.6.1
## [16] rvest_0.3.4 Cairo_1.5-10 xml2_1.2.2
## [19] labeling_0.3 caTools_1.17.1.2 readr_1.3.1
## [22] stringr_1.4.0 digest_0.6.21 pkgconfig_2.0.3
## [25] htmltools_0.3.6 htmlwidgets_1.3 rlang_0.4.0
## [28] rstudioapi_0.10 shiny_1.3.2 jsonlite_1.6
## [31] crosstalk_1.0.0 gtools_3.8.1 RCurl_1.95-4.12
## [34] magrittr_1.5 Matrix_1.2-17 Rcpp_1.0.2
## [37] munsell_0.5.0 viridis_0.5.1 stringi_1.4.3
## [40] yaml_2.2.0 plyr_1.8.4 grid_3.6.1
## [43] parallel_3.6.1 gdata_2.18.0 promises_1.0.1
## [46] crayon_1.3.4 hms_0.5.1 locfit_1.5-9.1
## [49] zeallot_0.1.0 pillar_1.4.2 igraph_1.2.4.1
## [52] corpcor_1.6.9 glue_1.3.1 evaluate_0.14
## [55] vctrs_0.2.0 httpuv_1.5.1 gtable_0.3.0
## [58] purrr_0.3.2 assertthat_0.2.1 xfun_0.8
## [61] mime_0.7 xtable_1.8-4 RSpectra_0.15-0
## [64] later_0.8.0 viridisLite_0.3.0 rARPACK_0.11-0
## [67] tibble_2.1.3 ellipse_0.4.1